import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import plotly.express as px
import plotly
import plotly.graph_objs as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
# pour afficher dynamiquement dans le notebook
from IPython.display import clear_output
from IPython.display import display, Math, Markdown
import time
# Reference table: intergenerational income elasticity assigned to each
# world region (note the trailing space in "Sub-Saharan Africa " — it is
# present in the source data and kept on purpose).
_region_elasticities = {
    'Europe & Central Asia': 0.4,
    'Latin America & Caribbean': 0.66,
    'South Asia': 0.5,
    "Sub-Saharan Africa ": 0.66,
    "North America": 0.4,
    "East Asia & Pacific": 0.5,
    "Middle East & North Africa": 0.66,
}
data = [[region, elasticite] for region, elasticite in _region_elasticities.items()]
elast_region = pd.DataFrame(data, columns = ['region', 'elasticite'])
elast_region
| region | elasticite | |
|---|---|---|
| 0 | Europe & Central Asia | 0.40 |
| 1 | Latin America & Caribbean | 0.66 |
| 2 | South Asia | 0.50 |
| 3 | Sub-Saharan Africa | 0.66 |
| 4 | North America | 0.40 |
| 5 | East Asia & Pacific | 0.50 |
| 6 | Middle East & North Africa | 0.66 |
# Country-level elasticities: keep only the identifier / region / IGE
# columns, then drop missing values and duplicated rows in one chain.
elast_pays = (
    pd.read_csv("elasticite_pays.csv")
    .loc[:, ["iso3", "region", 'IGEincome']]
    .dropna()
    .drop_duplicates()
)
elast_pays
| iso3 | region | IGEincome | |
|---|---|---|---|
| 60 | ALB | Europe & Central Asia | 0.815874 |
| 183 | AUS | High income | 0.275000 |
| 243 | AUT | High income | 0.245267 |
| 351 | BEL | High income | 0.183176 |
| 405 | BEN | Sub-Saharan Africa | 0.855116 |
| ... | ... | ... | ... |
| 6202 | USA | High income | 0.537666 |
| 6274 | UZB | Europe & Central Asia | 0.496764 |
| 6330 | VNM | East Asia & Pacific | 0.480000 |
| 6426 | ZAF | Sub-Saharan Africa | 0.677000 |
| 6480 | COD | Sub-Saharan Africa | 0.707703 |
75 rows × 3 columns
# Visual sanity check: display five random rows of the country table.
elast_pays.sample(5)
| iso3 | region | IGEincome | |
|---|---|---|---|
| 6202 | USA | High income | 0.537666 |
| 4802 | PER | Latin America & Caribbean | 0.667000 |
| 2968 | JPN | High income | 0.340000 |
| 2112 | GHA | Sub-Saharan Africa | 0.561605 |
| 3556 | LUX | High income | 0.380792 |
# Reference list of countries, indexed by ISO3 code. Passing the Series
# (rather than the column name) to set_index keeps the 'iso3' column
# itself in the frame, matching the original behaviour.
pays_ref = pd.read_csv('pays_ref.csv')
pays_ref = pays_ref.set_index(pays_ref['iso3'])
pays_ref.describe()
| iso3 | region | |
|---|---|---|
| count | 116 | 116 |
| unique | 116 | 5 |
| top | TUR | latin_america_africa |
| freq | 1 | 41 |
# Income-distribution quantiles per country. The CSV uses a comma as the
# decimal separator, hence decimal="," on read.
dist_revenus = pd.read_csv("data_projet7.csv", decimal =",")
dist_revenus.describe()
| year_survey | quantile | nb_quantiles | income | gdpppp | |
|---|---|---|---|---|---|
| count | 11599.000000 | 11599.000000 | 11599.0 | 11599.000000 | 1.139900e+04 |
| mean | 2007.982757 | 50.500819 | 100.0 | 6069.224260 | 5.022128e+04 |
| std | 0.909633 | 28.868424 | 0.0 | 9414.185972 | 4.000688e+05 |
| min | 2004.000000 | 1.000000 | 100.0 | 16.719418 | 3.031931e+02 |
| 25% | 2008.000000 | 25.500000 | 100.0 | 900.685515 | 2.576000e+03 |
| 50% | 2008.000000 | 51.000000 | 100.0 | 2403.244900 | 7.560000e+03 |
| 75% | 2008.000000 | 75.500000 | 100.0 | 7515.420900 | 1.877300e+04 |
| max | 2011.000000 | 100.000000 | 100.0 | 176928.550000 | 4.300332e+06 |
# Row/country counts: 116 countries x 100 quantiles should give 11600 rows,
# so exactly one quantile row is missing somewhere (11600 - 11599 = 1).
print("Nombre de lignes : %d" %len(dist_revenus))
print("Nombre de pays : %s" %len(dist_revenus["country"].unique()))
print("Or 116 * 100 =", 116*100, "il manque vraisemblablement ", (116*100 -len(dist_revenus)), "quantile.")
Nombre de lignes : 11599 Nombre de pays : 116 Or 116 * 100 = 11600 il manque vraisemblablement 1 quantile.
# Find the country whose quantile count differs from the expected 100.
pays_nb_quantiles = (
    dist_revenus[['country', 'quantile']]
    .groupby('country')
    .count()
)
pays_nb_quantiles.loc[pays_nb_quantiles['quantile'] != 100]
| quantile | |
|---|---|
| country | |
| LTU | 99 |
# Extract Lithuania's quantile labels and print whichever value of 1..100
# is absent — the missing quantile turns out to be 41.
ltu_quantiles = dist_revenus.loc[dist_revenus['country'] == 'LTU', 'quantile'].values
present = set(ltu_quantiles)
for i in range(1, 101):
    if i not in present:
        print(i)
41
# Show quantiles 40 and 42 for Lithuania: the missing row 41 will be
# rebuilt as the average of these two neighbours.
mask_ltu = dist_revenus['country'] == 'LTU'
mask_q = dist_revenus['quantile'].isin([40, 42])
dist_revenus.loc[mask_ltu & mask_q]
| country | year_survey | quantile | nb_quantiles | income | gdpppp | |
|---|---|---|---|---|---|---|
| 6239 | LTU | 2008 | 40 | 100 | 4868.4507 | 17571.0 |
| 6240 | LTU | 2008 | 42 | 100 | 4895.8306 | 17571.0 |
# Rebuild the missing LTU quantile 41: its income is interpolated as the
# mean of quantiles 40 and 42, and the survey year / GDP-PPP are reused
# from the neighbouring rows instead of being re-typed as magic constants
# (the original hard-coded 2008 and 17571.0 by hand).
ltu_mask = dist_revenus['country'] == 'LTU'
q40_row = dist_revenus.loc[ltu_mask & (dist_revenus['quantile'] == 40)].iloc[0]
q42_row = dist_revenus.loc[ltu_mask & (dist_revenus['quantile'] == 42)].iloc[0]
# Row label '6239a' slots the new record next to its source rows.
dist_revenus.loc['6239a'] = [
    'LTU',
    q40_row['year_survey'],                      # 2008
    41,
    100,
    (q40_row['income'] + q42_row['income']) / 2,
    q40_row['gdpppp'],                           # 17571.0
]
# Re-run the completeness check: no country with a count != 100 remains.
pays_nb_quantiles = dist_revenus[['country', 'quantile']].groupby('country').count()
pays_nb_quantiles[pays_nb_quantiles['quantile'] != 100]
| quantile | |
|---|---|
| country |
dist_revenus.describe()
# The gdpppp max (~4,300,000) is almost certainly a data-entry error —
# investigated below.
| year_survey | quantile | nb_quantiles | income | gdpppp | |
|---|---|---|---|---|---|
| count | 11600.000000 | 11600.000000 | 11600.0 | 11600.000000 | 1.140000e+04 |
| mean | 2007.982759 | 50.500000 | 100.0 | 6069.121925 | 5.021841e+04 |
| std | 0.909593 | 28.867314 | 0.0 | 9413.786596 | 4.000513e+05 |
| min | 2004.000000 | 1.000000 | 100.0 | 16.719418 | 3.031931e+02 |
| 25% | 2008.000000 | 25.750000 | 100.0 | 900.768507 | 2.576000e+03 |
| 50% | 2008.000000 | 50.500000 | 100.0 | 2403.492950 | 7.709000e+03 |
| 75% | 2008.000000 | 75.250000 | 100.0 | 7515.313700 | 1.877300e+04 |
| max | 2011.000000 | 100.000000 | 100.0 | 176928.550000 | 4.300332e+06 |
# Confirm the suspicious maximum numerically.
max_value = dist_revenus["gdpppp"].max()
print('Maximum value:', max_value)
Maximum value: 4300332.0
# Boxplot of gdpppp: a single extreme outlier stands out.
fig = px.box(dist_revenus, y="gdpppp", width=300, height=400)
fig.show()
# Isolate rows above 500k and list the offending country: Fiji (FJI).
outlier_mask = dist_revenus["gdpppp"] > 500000
out = dist_revenus.loc[outlier_mask]
print(out["country"].unique())
['FJI']
# Count missing values per column: gdpppp has 200 NaNs.
dist_revenus.isna().sum()
country 0 year_survey 0 quantile 0 nb_quantiles 0 income 0 gdpppp 200 dtype: int64
# Rows with at least one NaN all belong to XKX (Kosovo) and PSE (Palestine).
nan_mask = dist_revenus.isna().any(axis=1)
na = dist_revenus[nan_mask]
na.country.unique()
array(['XKX', 'PSE'], dtype=object)
Il faut donc corriger ou compléter les données de ces 3 pays (FJI, XKX, PSE) : les valeurs de PIB/PPA peuvent être récupérées auprès de la Banque mondiale (https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD?locations=PS) puis insérées dans le jeu de données.
# Patch GDP/PPP with World Bank figures: Fiji (erroneous value) plus the
# two countries with entirely missing data (Kosovo, Palestine).
corrections = {'FJI': 7777, 'XKX': 6539, 'PSE': 3709}
for iso, gdp in corrections.items():
    dist_revenus.loc[dist_revenus['country'] == iso, 'gdpppp'] = gdp
# One GDP/PPP value per country, indexed by country code.
gdp_ppp = dist_revenus[['country', 'gdpppp']].drop_duplicates()
gdp_ppp.set_index('country', inplace=True)
gdp_ppp.sample(5)
| gdpppp | |
|---|---|
| country | |
| POL | 16436.0000 |
| NLD | 38065.0000 |
| SWZ | 4748.0000 |
| MRT | 2226.7344 |
| SLV | 6270.0000 |
# Distribution of the corrected per-country GDP/PPP values.
gdp_ppp.describe()
| gdpppp | |
|---|---|
| count | 116.000000 |
| mean | 12436.139644 |
| std | 13153.603367 |
| min | 303.193050 |
| 25% | 2577.500000 |
| 50% | 7532.500000 |
| 75% | 17679.250000 |
| max | 73127.000000 |
import pandas_datareader as pdr
from pandas_datareader import wb
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-23-6959e85f153d> in <module> ----> 1 import pandas_datareader as pdr 2 from pandas_datareader import wb ModuleNotFoundError: No module named 'pandas_datareader'
# Search the World Bank catalogue for Gini indicators: SI.POV.GINI is the
# one we want.
matches = wb.search('gini')
print(matches["id"].unique())
# Download the Gini index for every country over the 1998-2008 period.
gini = wb.download(
    indicator='SI.POV.GINI', country="all", start=1998, end=2008
).reset_index()
gini.head()
gini.sample(5)
# Country metadata (region, income level, lending type, capital, ...)
# from the World Bank, over the same period and for all countries.
reader = pdr.wb.WorldBankReader(
    symbols='SI.POV.GINI', countries="all", start=1998, end=2008,
    freq='A', retry_count=3, pause=0.1, session=None, errors='warn',
)
gini_2 = reader.get_countries().rename(columns={"name": "country"})
gini_2.head()
# Join the Gini observations with the country metadata on the country name.
gini_3 = pd.merge(left=gini, right=gini_2, on="country")
# Drop aggregate pseudo-countries (regions, income groups, ...).
aggregate_index = gini_3[gini_3.lendingType == "Aggregates"].index
gini_4 = gini_3.drop(aggregate_index, inplace=False)
gini_4.head()
# Many missing Gini values were expected: average each country over the
# whole period and use that mean as a single reference index.
gini_4.isna().sum()
gini_pivot = gini_4.pivot(index="country", columns='year', values='SI.POV.GINI')
gini_pivot["moy_gini"] = gini_pivot.mean(numeric_only=False, axis=1)
gini_pivot.moy_gini.isna().sum()
# ~70 countries out of 210 have no data at all; keep the ~140 that do.
gini_ok = gini_pivot.loc[gini_pivot['moy_gini'].notna(), :].reset_index()
gini_ok = gini_ok.loc[:, ["country", 'moy_gini']]
gini_ok.head()
gini_ok.sample(5)
# Total population per country in 2012 (indicator SP.POP.TOTL).
pop_tot = wb.download(
    indicator='SP.POP.TOTL', country="all", start=2012, end=2012
).reset_index()
pop_tot.head()
# Same treatment as for the Gini index: join the country metadata, then
# discard the aggregate rows.
pop_tot_2 = pd.merge(left=pop_tot, right=gini_2, on="country")
agg_index = pop_tot_2[pop_tot_2.lendingType == "Aggregates"].index
pop_tot_3 = pop_tot_2.drop(agg_index, inplace=False)
pop_tot_3.head()
# Keep the 2012 world population for the coverage computation later on.
pop_agg = pop_tot.loc[pop_tot["country"] == "World"]
pop_mond_2012 = pop_agg.iloc[0]['SP.POP.TOTL']
print("En 2012, le monde comptait", round(pop_mond_2012), "habitants.")
pop_agg
pop_tot_3.sample(5)
# Merge the country-level elasticities onto the population dataset
# (keying on the ISO3 code).
pop_test = pop_tot_3.rename(columns = {"iso3c" : "iso3"})
merge_test = pd.merge(left = pop_test, right = elast_pays, on = "iso3", how="outer")
merge_test = merge_test.rename(columns = {'region_x' : "region"})
# Drop the last row — presumably an unmatched artefact of the outer
# merge; TODO(review): confirm against the actual data.
merge_test = merge_test.drop(merge_test.tail(1).index ,inplace=False)
# Normalise region labels that differ between the two sources (note the
# trailing space in the left-hand label below — it matches the source data).
merge_test = merge_test.replace("Latin America & Caribbean ", "Latin America and Caribbean")
elast_region =elast_region.replace("Latin America & Caribbean", "Latin America and Caribbean")
# Merge the regional elasticities as a fallback value.
merge_test_2 = pd.merge(left = merge_test, right = elast_region, on = 'region', how = 'outer')
# Final elasticity: the country-specific IGEincome when available,
# otherwise the regional value. After fillna(0), rows without a country
# value have IGEincome == 0, so the row-sum of [IGEincome, elasticite]
# equals the regional elasticity there; .where keeps that sum only where
# IGEincome == 0 and keeps IGEincome everywhere else.
merge_test_2['IGEincome'] = merge_test_2['IGEincome'].fillna(0)
merge_test_2["elasticite_ok"] = merge_test_2[["IGEincome", "elasticite"]].sum(axis = 1).where(merge_test_2["IGEincome"] == 0, merge_test_2["IGEincome"])
# Inspect the merged columns, then keep only those still needed downstream.
print(*merge_test_2.columns, sep="\n")
kept_cols = [
    "country", "year", "SP.POP.TOTL", "iso3", "region",
    "adminregion", "incomeLevel", "lendingType", "capitalCity",
    "longitude", "latitude", "elasticite_ok",
]
pop_tot_4 = merge_test_2[kept_cols]
pop_tot_4
On peut réunir gini, gdp/PPP & population
# Align column names ahead of the final merges: the World Bank 'country'
# column holds full names, so rename it 'country_name' everywhere.
gini_merge = (
    gini_ok.reset_index()[["country", 'moy_gini']]
    .rename(columns={"country": "country_name"})
)
gini_merge.head()
gdp_ppp_merge = gdp_ppp.reset_index()
gdp_ppp_merge.head()
pop_tot_4
# NOTE(review): the "iso3c" key below no longer exists in pop_tot_4 (it
# was renamed to "iso3" earlier), so that part of the rename is a no-op.
pop_merge = pop_tot_4.rename(columns={"country": "country_name", "iso3c": "country"})
pop_merge.head()
# gdp_ppp is keyed by ISO3 code: rename so the merge key matches pop_merge.
gdp_ppp_merge = gdp_ppp_merge.rename(columns={"country": "iso3"})
# First merge on the ISO3 code, then add the Gini means on the full name.
data = pop_merge.merge(gdp_ppp_merge, on="iso3")
data
gini_merge
data_2 = data.merge(gini_merge, on="country_name")
data_2
# Keep the analysis columns, rename them to stable short names, and
# report how much of the world the final dataset covers.
data_2 = data_2[["country_name", "SP.POP.TOTL", "iso3", "region", "incomeLevel",
                 "elasticite_ok", "gdpppp", "moy_gini", "longitude", "latitude"]]
data_2 = data_2.rename(columns = {"country_name" : "country", "SP.POP.TOTL" : "poptot"})
data_2.info()
# 108 countries remain in the final dataset.
print("L'indicateur agrégé de gini couvre les années 1998 à 2008")
print("Les données de revenus & PIB couvrent les années", dist_revenus.year_survey.unique())
print("Les données de population portent sur l'année", pop_tot.iloc[0]['year'])
pop_tot_data = data_2.poptot.sum()
print("Le jeu de données couvre", round((pop_tot_data/pop_mond_2012)*100, 2), "% de la population mondiale en 2012")
x = data_2.country.count()
# Fixed typo in the printed message: "reconus" -> "reconnus".
print("Le jeu de données couvre",x , "pays, soit", round((x/195)*100, 2) ,"% des pays reconnus par l'ONU")
data_2
# Persist the cleaned datasets, then reload them so the analysis below
# can be re-run without repeating the slow World Bank downloads.
data_ok = data_2
gdp_ppp_ok = gdp_ppp
dist_revenus_ok = dist_revenus[["country", "year_survey", "quantile", "income"]]
# Fix: the notebook export had jammed several statements onto single
# lines, which is a syntax error in a plain .py file — one per line.
data_ok.to_csv("données/data_ok.csv")
gdp_ppp_ok.to_csv("données/gdp_ppp_ok.csv")
dist_revenus_ok.to_csv("données/dist_revenus_ok.csv")
gini_4.to_csv("données/gini_4.csv")
gini_ok.to_csv("données/gini_ok.csv")
data_ok = pd.read_csv("données/data_ok.csv")
gdp_ppp_ok = pd.read_csv("données/gdp_ppp_ok.csv")
dist_revenus_ok = pd.read_csv("données/dist_revenus_ok.csv")
gini_ok = pd.read_csv("données/gini_ok.csv")
gini_4 = pd.read_csv("données/gini_4.csv")
# Income distribution for France, one point per centile.
france_mask = dist_revenus_ok.country == 'FRA'
df = dist_revenus_ok.loc[france_mask]
fig = px.scatter(
    df,
    x="quantile",
    y="income",
    labels={'quantile': 'Centile', 'income': 'Revenus en dollars'},
    title="Distribution des revenus en France par centiles",
)
fig.show()
# GDP/PPP vs mean Gini index, coloured by World Bank income level.
fig = px.scatter(data_ok,
x="gdpppp",
y="moy_gini",
labels={'gdpppp':'PIB/PPA',
'moy_gini':'Indice de Gini'},
color="incomeLevel")
fig.show()
# Histogram of the per-country mean Gini indices.
fig = px.histogram(data_ok, x="moy_gini", nbins=10, labels={'moy_gini':'Indice de Gini'}, title = "Distribution des indices de gini")
fig.show()
# Pearson correlations between the numeric indicators. 'incomeLevel' is a
# text category: pandas >= 2.0 raises on non-numeric columns in corr()
# (older versions silently dropped it), so restrict the computation to
# the numeric columns — the displayed matrix only ever contained these
# three anyway.
cor = data_ok[['poptot', 'gdpppp', 'moy_gini']].corr()
print(cor)
fig = px.imshow(cor)
fig.show()
poptot gdpppp moy_gini poptot 1.000000 -0.072635 0.003686 gdpppp -0.072635 1.000000 -0.413425 moy_gini 0.003686 -0.413425 1.000000
from sklearn.datasets import load_iris
from scipy.stats import spearmanr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
# Working subset for the correlation study (the 'iris_df' name is kept
# from the tutorial example this section was adapted from).
iris_df = data_ok.loc[:, ['poptot', 'gdpppp', 'moy_gini', 'incomeLevel']]
def get_correlations(df):
    """Pairwise Spearman correlations with below-diagonal p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; rows with NaN and non-numeric columns are dropped.

    Returns
    -------
    (correlations, pvalues) : tuple of pandas.DataFrame
        Square float frames indexed/columned by the numeric columns of
        *df*. ``correlations`` holds the full symmetric matrix;
        ``pvalues`` only keeps entries strictly below the diagonal (NaN
        elsewhere) so each test is counted once in the multiple-testing
        correction downstream.
    """
    df = df.dropna()._get_numeric_data()
    cols = df.columns
    correlations = pd.DataFrame(index=cols, columns=cols, dtype=float)
    pvalues = pd.DataFrame(index=cols, columns=cols, dtype=float)
    for ix, r in enumerate(cols):
        for jx, c in enumerate(cols):
            rho, p = spearmanr(df[r], df[c])
            # .loc replaces the original chained assignment
            # (correlations[c][r] = ...), which is deprecated and breaks
            # under pandas copy-on-write.
            correlations.loc[r, c] = rho
            # Only store p-values below the diagonal.
            pvalues.loc[r, c] = p if ix > jx else np.nan
    return correlations.astype("float"), pvalues.astype("float")
# Run the Spearman analysis on the country indicators.
correlations, uncorrected_p_values = get_correlations(iris_df)
# Correct p-values for multiple testing and check significance (True if the corrected p-value < 0.05)
# NOTE(review): multipletests is called with its default correction
# method here — confirm it is the intended one.
shape = uncorrected_p_values.values.shape
significant_matrix = multipletests(uncorrected_p_values.values.flatten())[0].reshape(
shape
)
# Clustered heatmap of the Spearman correlations; asterisks mark cells
# whose corrected test (in either triangle) is significant.
g = sns.clustermap(correlations, cmap="vlag", vmin=-1, vmax=1)
# Keep the y-axis labels horizontal for readability.
for tick in g.ax_heatmap.get_yticklabels():
    tick.set_rotation(0)
# Add an asterisk onto each off-diagonal significant cell; indices follow
# the clustermap's reordered rows, hence the ix/jx indirection.
for i, ix in enumerate(g.dendrogram_row.reordered_ind):
    for j, jx in enumerate(g.dendrogram_row.reordered_ind):
        if i != j:
            text = g.ax_heatmap.text(
                j + 0.5,
                i + 0.5,
                "*" if significant_matrix[ix, jx] or significant_matrix[jx, ix] else "",
                ha="center",
                va="center",
                color="black",
            )
            text.set_fontsize(20)
# Pie chart: number of countries in each income level.
df = (
    data_ok[['country', 'incomeLevel']]
    .groupby('incomeLevel')
    .count()
    .reset_index()
)
fig = px.pie(df, values='country', names='incomeLevel',
             title='Répartition des niveaux de revenus')
fig.show()
# Per-country means of the numeric columns; display the five countries
# with the lowest mean income.
dist_rev_moy = dist_revenus_ok.groupby("country").mean()
dist_rev_moy.sort_values(by="income").head()
| year_survey | quantile | income | |
|---|---|---|---|
| country | |||
| COD | 2008.0 | 50.5 | 276.016044 |
| MDG | 2010.0 | 50.5 | 345.237074 |
| CIV | 2008.0 | 50.5 | 399.835204 |
| KEN | 2007.0 | 50.5 | 519.320032 |
| SWZ | 2009.0 | 50.5 | 530.283828 |
# Summary statistics of the per-country mean incomes.
dist_rev_moy.describe()
| year_survey | quantile | income | |
|---|---|---|---|
| count | 116.000000 | 116.0 | 116.000000 |
| mean | 2007.982759 | 50.5 | 6069.121925 |
| std | 0.913500 | 0.0 | 6660.966925 |
| min | 2004.000000 | 50.5 | 276.016044 |
| 25% | 2008.000000 | 50.5 | 1374.270126 |
| 50% | 2008.000000 | 50.5 | 3287.174692 |
| 75% | 2008.000000 | 50.5 | 7077.900152 |
| max | 2011.000000 | 50.5 | 26888.511518 |
# Print the column names of the final dataset, one per line.
print(*data_ok.columns, sep="\n")
Unnamed: 0 country poptot iso3 region incomeLevel elasticite_ok gdpppp moy_gini longitude latitude
# Boxplot of the retained elasticity, split and coloured by income level.
fig = px.box(data_ok,
x="elasticite_ok",
y="incomeLevel",
color = "incomeLevel"
)
fig.show()
# Elasticity vs Gini index; bubble size = GDP/PPP, colour = region,
# log scale on the elasticity axis.
fig = px.scatter(data_ok,
x="elasticite_ok",
y="moy_gini",
size="gdpppp",
color="region", hover_name="country", log_x=True, size_max=60)
fig.show()
# World map: one marker per country, sized by GDP/PPP and coloured by
# income level.
fig = px.scatter_geo(data_ok, locations="iso3", color="incomeLevel",
hover_name="country", size="gdpppp", size_max = 50,
projection="natural earth")
fig.show()